About the project

I’m expect to learn how to do Rstudio with huge data. In addition, I want to know about how to visualize the data properly.

Link: https://github.com/swinesci/IODS-project

I have a problem with chapter 2

Write a short description about the course and add a link to your GitHub repository here. This is an R Markdown (.Rmd) file so you can use R Markdown syntax.


Excercise 2

to see the data

df <- read.table("http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/learning2014.txt", sep=",", header=TRUE)

to see structure and dimmension

str(df)
## 'data.frame':    166 obs. of  7 variables:
##  $ gender  : Factor w/ 2 levels "F","M": 1 2 1 2 2 1 2 1 2 1 ...
##  $ age     : int  53 55 49 53 49 38 50 37 37 42 ...
##  $ attitude: num  3.7 3.1 2.5 3.5 3.7 3.8 3.5 2.9 3.8 2.1 ...
##  $ deep    : num  3.58 2.92 3.5 3.5 3.67 ...
##  $ stra    : num  3.38 2.75 3.62 3.12 3.62 ...
##  $ surf    : num  2.58 3.17 2.25 2.25 2.83 ...
##  $ points  : int  25 12 24 10 22 21 21 31 24 26 ...
dim(df)
## [1] 166   7

to acess the ggplot 2 library

library(ggplot2)
ggplot(df)

pairs(df[-1])

## to access the GGally and ggplot2

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)

to see summary

summary(df)
##  gender       age           attitude          deep            stra      
##  F:110   Min.   :17.00   Min.   :1.400   Min.   :1.583   Min.   :1.250  
##  M: 56   1st Qu.:21.00   1st Qu.:2.600   1st Qu.:3.333   1st Qu.:2.625  
##          Median :22.00   Median :3.200   Median :3.667   Median :3.188  
##          Mean   :25.51   Mean   :3.143   Mean   :3.680   Mean   :3.121  
##          3rd Qu.:27.00   3rd Qu.:3.700   3rd Qu.:4.083   3rd Qu.:3.625  
##          Max.   :55.00   Max.   :5.000   Max.   :4.917   Max.   :5.000  
##       surf           points     
##  Min.   :1.583   Min.   : 7.00  
##  1st Qu.:2.417   1st Qu.:19.00  
##  Median :2.833   Median :23.00  
##  Mean   :2.787   Mean   :22.72  
##  3rd Qu.:3.167   3rd Qu.:27.75  
##  Max.   :4.333   Max.   :33.00

create a more advanced plot matrix with ggpairs

p <- ggpairs(df, maaping = aes(col = gender, alpha = 0.3), lower = list(combo = wrap("facethist", bins = 20)))
## Warning in warn_if_args_exist(list(...)): Extra arguments: "maaping" are
## being ignored. If these are meant to be aesthetics, submit them using the
## 'mapping' variable within ggpairs with ggplot2::aes or ggplot2::aes_string.

to draw the plot

p

##The three explanatory variables are “attitude”, “strategy” and “surface”. These are choosen based on r value

to see scatter plot

qplot(attitude, points, data = df) + geom_smooth(method = "lm")

## fit a linear model

model <- lm(points ~1, data = df)

summary of the model

summary(model)
## 
## Call:
## lm(formula = points ~ 1, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.7169  -3.7169   0.2831   5.0331  10.2831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.7169     0.4575   49.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.895 on 165 degrees of freedom

to create an plot matrix

library(GGally)
ggpairs(df, lower = list(combo = wrap("facethist", bins = 20)))

to create a regression model with multibple explanatory variables

model2<- lm(points ~ attitude + stra + surf, data = df)
summary(model2)
## 
## Call:
## lm(formula = points ~ attitude + stra + surf, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1550  -3.4346   0.5156   3.6401  10.8952 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.0171     3.6837   2.991  0.00322 ** 
## attitude      3.3952     0.5741   5.913 1.93e-08 ***
## stra          0.8531     0.5416   1.575  0.11716    
## surf         -0.5861     0.8014  -0.731  0.46563    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.296 on 162 degrees of freedom
## Multiple R-squared:  0.2074, Adjusted R-squared:  0.1927 
## F-statistic: 14.13 on 3 and 162 DF,  p-value: 3.156e-08

remove the surf because its p value was the highest

model3<- lm(points ~attitude + stra, data=df)
summary(model3)
## 
## Call:
## lm(formula = points ~ attitude + stra, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.6436  -3.3113   0.5575   3.7928  10.9295 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.9729     2.3959   3.745  0.00025 ***
## attitude      3.4658     0.5652   6.132 6.31e-09 ***
## stra          0.9137     0.5345   1.709  0.08927 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.289 on 163 degrees of freedom
## Multiple R-squared:  0.2048, Adjusted R-squared:  0.1951 
## F-statistic: 20.99 on 2 and 163 DF,  p-value: 7.734e-09

the most significant variable was attitude. Stra had only tendency

to draw diagnostic plots

par(mfrow = c(2,2))
plot(model3, which = c(1,2,5))

QQ plot showed this model is normally distributed all looks good


Excercise 3

alc <- read.csv("Z:\\IODS-project\\data\\alc.csv", sep = ",")

access the tidyverse libraries tidyr, dplyr, ggplot2

library(tidyr); library(dplyr); library(ggplot2)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
## 
##     nasa
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

glimpse at the alc data

glimpse(alc) 
## Observations: 382
## Variables: 36
## $ X          <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ school     <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP,...
## $ sex        <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F, ...
## $ age        <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15,...
## $ address    <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, ...
## $ famsize    <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, G...
## $ Pstatus    <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T, ...
## $ Medu       <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, ...
## $ Fedu       <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, ...
## $ Mjob       <fct> at_home, at_home, at_home, health, other, services,...
## $ Fjob       <fct> teacher, other, other, services, other, other, othe...
## $ reason     <fct> course, course, other, home, home, reputation, home...
## $ nursery    <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, ye...
## $ internet   <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes,...
## $ guardian   <fct> mother, father, mother, mother, father, mother, mot...
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, ...
## $ studytime  <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, ...
## $ failures   <int> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ schoolsup  <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no, ...
## $ famsup     <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes,...
## $ paid       <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes, ...
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes,...
## $ higher     <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ romantic   <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no...
## $ famrel     <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, ...
## $ freetime   <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, ...
## $ goout      <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, ...
## $ Dalc       <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Walc       <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, ...
## $ health     <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, ...
## $ absences   <int> 5, 3, 8, 1, 2, 8, 0, 4, 0, 0, 1, 2, 1, 1, 0, 5, 8, ...
## $ G1         <int> 2, 7, 10, 14, 8, 14, 12, 8, 16, 13, 12, 10, 13, 11,...
## $ G2         <int> 8, 8, 10, 14, 12, 14, 12, 9, 17, 14, 11, 12, 14, 11...
## $ G3         <int> 8, 8, 11, 14, 12, 14, 12, 10, 18, 14, 12, 12, 13, 1...
## $ alc_use    <dbl> 1.0, 1.0, 2.5, 1.0, 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1...
## $ high_use   <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FAL...

use gather() to gather columns into key-value pairs and then glimpse() at the resulting data

gather(alc) %>% glimpse
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Observations: 13,752
## Variables: 2
## $ key   <chr> "X", "X", "X", "X", "X", "X", "X", "X", "X", "X", "X", "...
## $ value <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",...

draw a bar plot of each variable

gather(alc) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped

access the tidyverse libraries dplyr and ggplot2

library(dplyr)
library(ggplot2)

produce summary statistics by group

alc %>% group_by(sex) %>% summarise(count = n())
## # A tibble: 2 x 2
##   sex   count
##   <fct> <int>
## 1 F       198
## 2 M       184

library(ggplot2)

initialize a plot of high_use and G3

g1 <- ggplot(alc, aes(x = high_use, y = G3, col = sex))

define the plot as a boxplot and draw it

g1 + geom_boxplot() + ylab("grade")

initialise a plot of high_use and absences

g2 <- ggplot(alc, aes(x = high_use, y = absences, col = sex))

define the plot as a boxplot and draw it

g2 + geom_boxplot() + ggtitle("Student absences by alcohol consumption and sex")

find the model with glm()

m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial")

alc and dlyr are available

find the model with glm()

m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial")

compute odds ratios (OR)

OR <- coef(m) %>% exp

compute confidence intervals (CI)

CI <- confint(m) %>% exp
## Waiting for profiling to be done...

fit the model

m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial")

predict() the probability of high_use

probabilities <- predict(m, type = "response")

add the predicted probabilities to ‘alc’

alc <- mutate(alc, probability = probabilities)

use the probabilities to make a prediction of high_use

alc <- mutate(alc, prediction = probability > 0.5)

see the last ten original classes, predicted probabilities, and class predictions

select(alc, failures, absences, sex, high_use, probability, prediction) %>% tail(10)
##     failures absences sex high_use probability prediction
## 373        1        0   M    FALSE   0.3749639      FALSE
## 374        1        7   M     TRUE   0.5353311       TRUE
## 375        0        1   F    FALSE   0.1406689      FALSE
## 376        0        6   F    FALSE   0.2069112      FALSE
## 377        1        2   F    FALSE   0.2199932      FALSE
## 378        0        2   F    FALSE   0.1523192      FALSE
## 379        2        2   F    FALSE   0.3068503      FALSE
## 380        0        3   F    FALSE   0.1647495      FALSE
## 381        0        4   M     TRUE   0.3568828      FALSE
## 382        0        2   M     TRUE   0.3153209      FALSE

tabulate the target variable versus the predictions

table(high_use = alc$high_use, prediction = alc$prediction)
##         prediction
## high_use FALSE TRUE
##    FALSE   259    9
##    TRUE     84   30

access dplyr and ggplot2

library(dplyr); library(ggplot2)

initialize a plot of ‘high_use’ versus ‘probability’ in ‘alc’

g <- ggplot(alc, aes(x = probability, y = high_use, col = prediction))

define the geom as points and draw the plot

g + geom_point()

tabulate the target variable versus the predictions

table(high_use = alc$high_use, prediction = alc$prediction) %>% prop.table %>% addmargins
##         prediction
## high_use      FALSE       TRUE        Sum
##    FALSE 0.67801047 0.02356021 0.70157068
##    TRUE  0.21989529 0.07853403 0.29842932
##    Sum   0.89790576 0.10209424 1.00000000

the logistic regression model m and dataset alc with predictions are available

define a loss function (average prediction error)

loss_func <- function(class, prob) {
  n_wrong <- abs(class - prob) > 0.5
  mean(n_wrong)
}

call loss_func to compute the average number of wrong predictions in the (training) data

loss_func(class = alc$high_use, prob = alc$probability)
## [1] 0.2434555

the logistic regression model m and dataset alc (with predictions) are available

define a loss function (average prediction error)

loss_func <- function(class, prob) {
  n_wrong <- abs(class - prob) > 0.5
  mean(n_wrong)
}

compute the average number of wrong predictions in the (training) data

loss_func(class = alc$high_use, prob = alc$probability)
## [1] 0.2434555

K-fold cross-validation

library(boot)
cv <- cv.glm(data = alc, cost = loss_func, glmfit = m, K = 10)

average number of wrong predictions in the cross validation

cv$delta[1]
## [1] 0.2460733

Excercise 4

access the MASS package

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select

load the data

data("Boston")

explore the dataset

str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
library(tidyr)
library(corrplot)
## corrplot 0.84 loaded
library(dplyr)

calculate the correlation matrix and round it

cor_matrix<-cor(Boston) %>% round(digits = 2 )

visualize the correlation matrix

corrplot(cor_matrix, method="circle", type="upper", cl.pos="b", tl.pos="d", tl.cex = 0.6)

It seems that “dis” is closely related to “age”, “nox”, “indus” and “zn”.

center and standardize variables

boston_scaled <- scale(Boston)

summaries of the scaled variables

summary(boston_scaled)
##       crim                 zn               indus        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202  
##       chas              nox                rm               age         
##  Min.   :-0.2723   Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331  
##  1st Qu.:-0.2723   1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366  
##  Median :-0.2723   Median :-0.1441   Median :-0.1084   Median : 0.3171  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.2723   3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059  
##  Max.   : 3.6648   Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164  
##       dis               rad               tax             ptratio       
##  Min.   :-1.2658   Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047  
##  1st Qu.:-0.8049   1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876  
##  Median :-0.2790   Median :-0.5225   Median :-0.4642   Median : 0.2746  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6617   3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058  
##  Max.   : 3.9566   Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372  
##      black             lstat              medv        
##  Min.   :-3.9033   Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.: 0.2049   1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median : 0.3808   Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4332   3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 0.4406   Max.   : 3.5453   Max.   : 2.9865

class of the boston_scaled object

class(boston_scaled)
## [1] "matrix"

change the object to data frame

boston_scaled <- as.data.frame(boston_scaled)

summary of the scaled crime rate

summary(boston_scaled$crim)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.419367 -0.410563 -0.390280  0.000000  0.007389  9.924110

create a quantile vector of crim and print it

bins <- quantile(boston_scaled$crim)
bins
##           0%          25%          50%          75%         100% 
## -0.419366929 -0.410563278 -0.390280295  0.007389247  9.924109610

create a categorical variable ‘crime’

crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))

look at the table of the new factor crime

table(crime)
## crime
##      low  med_low med_high     high 
##      127      126      126      127

remove original crim from the dataset

boston_scaled <- dplyr::select(boston_scaled, -crim)

add the new categorical value to scaled data

boston_scaled <- data.frame(boston_scaled, crime)

number of rows in the Boston dataset

n <- nrow(Boston)

choose randomly 80% of the rows

ind <- sample(n,  size = n * 0.8)

create train set

train <- boston_scaled[ind,]

create test set

test <- boston_scaled[-ind,]

save the correct classes from test data

correct_classes <- test$crime

remove the crime variable from test data

test <- dplyr::select(test, -crime)

linear discriminant analysis

lda.fit <- lda(crime ~ ., data = train)

the function for lda biplot arrows

lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "orange", tex = 0.75, choices = c(1,2)){
  heads <- coef(x)
  arrows(x0 = 0, y0 = 0, 
         x1 = myscale * heads[,choices[1]], 
         y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
  text(myscale * heads[,choices], labels = row.names(heads), 
       cex = tex, col=color, pos=3)
}

target classes as numeric

classes <- as.numeric(train$crime)

plot the lda results

plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

predict classes with test data

lda.pred <- predict(lda.fit, newdata = test)

cross tabulate the results

table(correct = correct_classes, predicted = lda.pred$class)
##           predicted
## correct    low med_low med_high high
##   low       14      12        0    0
##   med_low    6      20        4    0
##   med_high   0       8       14    1
##   high       0       0        0   23

load MASS and Boston

library(MASS)
data('Boston')

center and standardize variables

boston_scaled <- scale(Boston)

change the object to data frame from matrix type.

boston_scaled <- as.data.frame(boston_scaled)

Calculate the Euclidean distances between observations.

dist_eu <- dist(boston_scaled)

look at the summary of the distances

summary(dist_eu)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1343  3.4625  4.8241  4.9111  6.1863 14.3970

K-means clustering

km <-kmeans(boston_scaled, centers = 3)

plot the Boston dataset with clusters

pairs(boston_scaled, col = km$cluster)

Boston dataset is available

set.seed(123)

determine the number of clusters

k_max <- 10

calculate the total within sum of squares

twcss <- sapply(1:k_max, function(k){kmeans(boston_scaled, k)$tot.withinss})

visualize the results

library(ggplot2)
qplot(x = 1:k_max, y = twcss, geom = 'line')

k-means clustering

km <-kmeans(boston_scaled, centers = 2)

plot the Boston dataset with clusters

pairs(boston_scaled, col = km$cluster)

ggpairs

library(GGally)
ggpairs(boston_scaled)

Super bonus

model_predictors <- dplyr::select(train, -crime)

check the dimensions

dim(model_predictors)
## [1] 404  13
dim(lda.fit$scaling)
## [1] 13  3

matrix multiplication

matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

3D figure

plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = train$crime )

First one needs to do k-means with 4 clusters to compare the methods.

km3D <-kmeans(boston_scaled, centers = 4)

3D figure

plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = km3D$cluster[ind])